library(phyloseq); packageVersion("phyloseq")
## [1] '1.46.0'
library(ggplot2); packageVersion("ggplot2")
## [1] '3.5.1'
library(readr); packageVersion("readr")
## [1] '2.1.5'
library(tidyr); packageVersion("tidyr")
## [1] '1.3.1'
library(purrr); packageVersion("purrr")
## [1] '1.0.2'
library(furrr); packageVersion("furrr")
## Loading required package: future
## Warning: package 'future' was built under R version 4.3.3
## [1] '0.3.1'
library(dplyr); packageVersion("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## [1] '1.1.4'
library(stringr); packageVersion("stringr")
## [1] '1.5.1'
library(forcats); packageVersion("forcats")
## [1] '1.0.0'
library(metacoder); packageVersion("metacoder")
## This is metacoder version 0.3.7 (stable)
##
## Attaching package: 'metacoder'
## The following object is masked from 'package:ggplot2':
##
## map_data
## The following object is masked from 'package:phyloseq':
##
## filter_taxa
## [1] '0.3.7'
library(data.table); packageVersion("data.table")
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
## [1] '1.15.4'
library(decontam); packageVersion("decontam")
## [1] '1.22.0'
library(Biostrings); packageVersion("Biostrings")
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:data.table':
##
## first, second
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:tidyr':
##
## expand
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:data.table':
##
## shift
## The following object is masked from 'package:metacoder':
##
## reverse
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## The following object is masked from 'package:purrr':
##
## reduce
## The following object is masked from 'package:phyloseq':
##
## distance
## Loading required package: XVector
##
## Attaching package: 'XVector'
## The following object is masked from 'package:purrr':
##
## compact
## Loading required package: GenomeInfoDb
## Warning: package 'GenomeInfoDb' was built under R version 4.3.3
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:metacoder':
##
## complement
## The following object is masked from 'package:base':
##
## strsplit
## [1] '2.70.3'
library(magick); packageVersion("magick")
## Warning: package 'magick' was built under R version 4.3.3
## Linking to ImageMagick 6.9.12.93
## Enabled features: cairo, fontconfig, freetype, heic, lcms, pango, raw, rsvg, webp
## Disabled features: fftw, ghostscript, x11
## [1] '2.8.4'
library(vegan); packageVersion("vegan")
## Warning: package 'vegan' was built under R version 4.3.3
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.6-6.1
## [1] '2.6.6.1'
library(pdftools);packageVersion("pdftools")
## Warning: package 'pdftools' was built under R version 4.3.3
## Using poppler version 23.04.0
## [1] '3.4.1'
library(vegan); packageVersion("vegan")
## [1] '2.6.6.1'
library(grid)
##
## Attaching package: 'grid'
## The following object is masked from 'package:Biostrings':
##
## pattern
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:BiocGenerics':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
knitr::opts_knit$set(root.dir = "~/benchmark_demulticoder")
seed <- 1
set.seed(seed)
asv_matrix_rps10<-read.csv("demulticoder/data/final_asv_abundance_matrix_rps10.csv")
asv_matrix_rps10$dada2_tax <- asv_matrix_rps10$dada2_tax <- gsub("Stramenopila", "Eukaryota--100--Domain;Stramenopila", asv_matrix_rps10$dada2_tax)
asv_matrix_rps10 <- asv_matrix_rps10[, -1]
colnames(asv_matrix_rps10)[3:ncol(asv_matrix_rps10)] <- gsub("_rps10$", "", colnames(asv_matrix_rps10)[3:ncol(asv_matrix_rps10)])
asv_matrix_its<-read.csv("demulticoder/data/final_asv_abundance_matrix_its.csv")
asv_matrix_its$dada2_tax <- gsub("Fungi", "Eukaryota--100--Domain;Fungi", asv_matrix_its$dada2_tax)
asv_matrix_its <- asv_matrix_its[, -1]
colnames(asv_matrix_its)[3:ncol(asv_matrix_rps10)] <- gsub("_its$", "", colnames(asv_matrix_its)[3:ncol(asv_matrix_its)])
#Let's combine these matrices
#For easier analysis, we previously combined the two matrices, and appended domain info to each one so we can make one heat tree for combined dataset
sample_cols_its <- setdiff(names(asv_matrix_its), c("sequence", "dada2_tax"))
sample_cols_rps10 <- setdiff(names(asv_matrix_rps10), c("sequence", "dada2_tax"))
if(!all(sample_cols_its == sample_cols_rps10)) {
stop("Sample columns do not match between ITS and RPS10 dataframes!")
}
abundance <- rbind(
asv_matrix_its[, c("sequence", "dada2_tax", sample_cols_its)], # ITS data
asv_matrix_rps10[, c("sequence", "dada2_tax", sample_cols_rps10)] # RPS10 data
)
write.csv(abundance, "demulticoder/data/final_asv_abundance_matrix_combined_demulticoder.csv")
metadata_path <- file.path("demulticoder/data/metadata.csv")
metadata <- read_csv(metadata_path)
## Rows: 174 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): sample_name, well, organism, sample_type
## dbl (3): plate, path_conc, experiment
## lgl (2): flooded, is_ambiguous
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
print(metadata)
## # A tibble: 174 × 9
## sample_name plate well organism flooded path_conc experiment sample_type
## <chr> <dbl> <chr> <chr> <lgl> <dbl> <dbl> <chr>
## 1 S1 1 A01 Cry TRUE 100 1 Sample
## 2 S10 1 B02 Cry TRUE 1 1 Sample
## 3 S11 1 C02 Plu FALSE 1 1 Sample
## 4 S12 1 D02 Plu TRUE 1 1 Sample
## 5 S13 1 E02 Control TRUE 0 1 Sample
## 6 S14 1 F02 Cin TRUE 1 1 Sample
## 7 S15 1 H02 Cin FALSE 100 1 Sample
## 8 S16 1 A03 Cry FALSE 100 1 Sample
## 9 S17 1 B03 Cry TRUE 100 1 Sample
## 10 S18 1 C03 Control TRUE 0 1 Sample
## # ℹ 164 more rows
## # ℹ 1 more variable: is_ambiguous <lgl>
#Fourth, let's track reads
track_reads_demulticoder_its<-read.csv("demulticoder/data/track_reads_its.csv", row.names = 1)
track_reads_demulticoder_rps10<-read.csv("demulticoder/data/track_reads_rps10.csv", row.names = 1)
its_summary_reads<-summary(track_reads_demulticoder_its)
print(its_summary_reads)
## input filtered denoisedF denoisedR
## Min. : 82 Min. : 1 Min. : 1 Min. : 1
## 1st Qu.: 23440 1st Qu.:12671 1st Qu.:12375 1st Qu.:12308
## Median : 37507 Median :23008 Median :22684 Median :22513
## Mean : 42989 Mean :25466 Mean :25212 Mean :25078
## 3rd Qu.: 57640 3rd Qu.:31912 3rd Qu.:31614 3rd Qu.:31508
## Max. :136509 Max. :87421 Max. :87247 Max. :87170
## merged nonchim
## Min. : 0 Min. : 0
## 1st Qu.:11794 1st Qu.:11794
## Median :21770 Median :21770
## Mean :24322 Mean :24230
## 3rd Qu.:30406 3rd Qu.:30050
## Max. :86880 Max. :86868
rps10_summary_reads<-summary(track_reads_demulticoder_rps10)
print(rps10_summary_reads)
## input filtered denoisedF denoisedR
## Min. : 91 Min. : 26 Min. : 7 Min. : 6
## 1st Qu.: 37994 1st Qu.: 29199 1st Qu.: 29186 1st Qu.: 29184
## Median : 61070 Median : 48085 Median : 47950 Median : 48060
## Mean : 73511 Mean : 56500 Mean : 56435 Mean : 56433
## 3rd Qu.: 95487 3rd Qu.: 74825 3rd Qu.: 74408 3rd Qu.: 74436
## Max. :263101 Max. :219382 Max. :218997 Max. :219291
## merged nonchim
## Min. : 0 Min. : 0
## 1st Qu.: 29172 1st Qu.: 29172
## Median : 47653 Median : 47653
## Mean : 56030 Mean : 55349
## 3rd Qu.: 73485 3rd Qu.: 73400
## Max. :218521 Max. :217372
# Function to extract taxonomic levels and count unique entries with bootstrap filtering
# Function to extract taxonomic levels and count unique entries
analyze_taxonomy <- function(data) {
# Get taxonomy data
tax_data <- data$dada2_tax
# Split taxonomy string into components
tax_splits <- strsplit(tax_data, ";")
# Function to safely extract taxonomic level with proper rank checking
extract_tax_level <- function(tax_array, rank) {
# Find the position containing the rank
rank_pos <- grep(rank, tax_array)
if(length(rank_pos) > 0) {
# Extract and clean the taxonomy name
tax <- tax_array[rank_pos]
# Remove the rank pattern and any leading/trailing whitespace
cleaned_tax <- trimws(gsub(paste0("--\\d+--", rank), "", tax))
return(cleaned_tax)
}
return(NA)
}
# Extract each taxonomic level with proper hierarchy checking
families <- sapply(tax_splits, function(x) extract_tax_level(x, "Family"))
genera <- sapply(tax_splits, function(x) extract_tax_level(x, "Genus"))
species <- sapply(tax_splits, function(x) extract_tax_level(x, "Species"))
# Create a data frame to check hierarchy consistency
tax_df <- data.frame(
Family = families,
Genus = genera,
Species = species,
stringsAsFactors = FALSE
)
# Remove NA values before counting
families <- families[!is.na(families)]
genera <- genera[!is.na(genera)]
species <- species[!is.na(species)]
# Count unique entries
unique_counts <- list(
families = length(unique(families)),
genera = length(unique(genera)),
species = length(unique(species))
)
# Get prevalence with hierarchy checking
family_prev <- table(families)
genus_prev <- table(genera)
species_prev <- table(species)
# Sort prevalence tables
family_prev <- sort(family_prev, decreasing = TRUE)
genus_prev <- sort(genus_prev, decreasing = TRUE)
species_prev <- sort(species_prev, decreasing = TRUE)
# Print top 5 of each level for verification
cat("\nTop 5 most prevalent Families:\n")
print(head(family_prev, 5))
cat("\nTop 5 most prevalent Genera:\n")
print(head(genus_prev, 5))
cat("\nTop 5 most prevalent Species:\n")
print(head(species_prev, 5))
# Get most prevalent taxa with hierarchy verification
most_prevalent <- list(
family = if(length(family_prev) > 0) names(family_prev)[1] else "None found",
genus = if(length(genus_prev) > 0) names(genus_prev)[1] else "None found",
species = if(length(species_prev) > 0) names(species_prev)[1] else "None found"
)
return(list(
unique_counts = unique_counts,
most_prevalent = most_prevalent,
tax_df = tax_df # Return full taxonomy dataframe for inspection
))
}
# Run analysis for both datasets
if(exists("asv_matrix_rps10")) {
rps10_results <- analyze_taxonomy(asv_matrix_rps10)
}
##
## Top 5 most prevalent Families:
## families
## Pythiaceae Peronosporaceae Saprolegniaceae Lagenidiaceae Salisapiliaceae
## 172 125 20 10 9
##
## Top 5 most prevalent Genera:
## genera
## Pythium Phytophthora Aphanomyces Peronospora Saprolegnia
## 153 96 16 16 11
##
## Top 5 most prevalent Species:
## species
## Phytophthora_sp.kununurra Pythium_dissotocum Phytophthora_citrophthora
## 24 20 19
## Phytophthora_sojae Pythium_irregulare
## 13 13
if(exists("asv_matrix_its")) {
its_results <- analyze_taxonomy(asv_matrix_its)
}
##
## Top 5 most prevalent Families:
## families
## Fungi_fam_Incertae_sedis Serendipitaceae
## 366 200
## Rozellomycota_fam_Incertae_sedis Hyaloscyphaceae
## 176 157
## Tuberaceae
## 128
##
## Top 5 most prevalent Genera:
## genera
## Fungi_gen_Incertae_sedis Serendipita
## 366 196
## Rozellomycota_gen_Incertae_sedis Tuber
## 176 128
## Hyaloscypha
## 94
##
## Top 5 most prevalent Species:
## species
## Fungi_sp Serendipita_sp Rozellomycota_sp
## 366 190 176
## Tuber_pseudobrumale Archaeorhizomyces_sp
## 128 48
First we will configure the combined taxmap object
#load reference database
rps10_database <- read_fasta("demulticoder/data/rps10_reference_db.fa")
innoc_species <- c(Cin = "Phytophthora_cinnamomi", Plu = "Phytophthora_plurivora", Cry = "Pythium_cryptoirregulare")
innoc_regex <- paste0('(', paste0(innoc_species, collapse = '|'), ')')
innoc_seqs <- rps10_database[grepl(names(rps10_database), pattern = innoc_regex)]
innoc_seqs <- innoc_seqs[! duplicated(innoc_seqs)]
names(innoc_seqs) <- str_match(names(innoc_seqs), pattern = innoc_regex)[, 2]
iupac_match <- function(asv_chars, ref_chars) {
map2_lgl(asv_chars, ref_chars, function(asv, ref) grepl(asv, pattern = paste0('[', IUPAC_CODE_MAP[ref], ']+')))
}
# Count number of mismatches in an alignment, allowing for IUPAC codes in reference
align_mismatch <- function(alignment) {
asv_chars <- strsplit(as.character(alignment@pattern), '')[[1]]
ref_chars <- strsplit(as.character(alignment@subject), '')[[1]]
sum(! iupac_match(asv_chars, ref_chars))
}
# Align each sequence to each asv and make table of results"
aligned_data_path <- file.path("demulticoder/data", "infection_aligned_data.rds")
if (file.exists(aligned_data_path)) {
aligned_data <- readRDS(aligned_data_path)
} else {
aligned_data <- future_map_dfr(seq_along(innoc_seqs), function(i) {
aligned <- lapply(abundance$sequence, function(asv) pairwiseAlignment(pattern = asv, subject = innoc_seqs[i], type = 'global-local'))
print(paste("Processing species:", names(innoc_seqs)[i]))
tibble(
species = names(innoc_seqs)[i],
align_len = map_dbl(aligned, nchar),
mismatch = map_dbl(aligned, align_mismatch),
pid = (align_len - mismatch) / align_len,
asv_seq = abundance$sequence,
ref_seq = innoc_seqs[i],
alignment = aligned
)
})
saveRDS(aligned_data, file = aligned_data_path)
}
# Convert ASV abundances to proportions for use with matches
abundance_prop <- abundance
abundance_prop[metadata$sample_name] <- lapply(abundance_prop[metadata$sample_name], function(counts) counts/sum(counts))
innoculum_asv_mismatch_threshold <-1
# Sum the read counts of ASVs representing the same species used for inoculation
infection_data <- aligned_data %>%
filter(mismatch <= innoculum_asv_mismatch_threshold) %>%
left_join(abundance_prop, by = c(asv_seq = "sequence")) %>%
group_by(asv_seq) %>% slice_min(mismatch, with_ties = FALSE) # Only consider the best match per ASV
## Update abundance matrix and metadata with results
inoculum_asv_key <- setNames(infection_data$species, infection_data$asv_seq)
abundance <- mutate(abundance, is_inoculum = sequence %in% infection_data$asv_seq, inoculum = inoculum_asv_key[sequence], .after = "dada2_tax")
write_csv(abundance, file.path('demulticoder/data', 'abundance_with_infection_data.csv'))
#Let's append metadata table with more info
species_name_key <- c(Phytophthora_cinnamomi = "cin_prop", Phytophthora_plurivora = "plu_prop", Pythium_cryptoirregulare = "cry_prop")
sample_infection_data <- infection_data %>%
group_by(species) %>% summarise_at(metadata$sample_name, sum) %>% # Sum per inoculum species
gather(key = "sample_name", value = "read_prop", !!! metadata$sample_name) %>%
mutate(species = species_name_key[species]) %>%
mutate(read_prop = ifelse(is.nan(read_prop), 0, read_prop)) %>%
tidyr::spread(key = 'species', value = 'read_prop')
metadata <- metadata %>%
select(sample_name, plate, well, organism, flooded, path_conc, experiment, sample_type, is_ambiguous) %>% # Avoid duplicating columns from previous runs
left_join(sample_infection_data, by = "sample_name")
# Get expected inoculum proportion and non-target proportion
metadata$expected_innoc_prop <- map_dbl(1:nrow(metadata), function(i) {
if (metadata$sample_type[i] == "Mock community") {
return(NA)
} else if (metadata$sample_type[i] == "Negative control" || metadata$organism[i] == "Control") {
return(NA)
} else {
prop_col <- paste0(tolower(metadata$organism[i]), '_prop')
return(metadata[[i, prop_col]])
}
})
metadata$unexpected_innoc_prop <- map_dbl(1:nrow(metadata), function(i) {
prop_cols <- c("cin_prop", "cry_prop", "plu_prop")
if (metadata$sample_type[i] == "Mock community" || metadata$sample_type[i] == "Negative control" || metadata$organism[i] == "Control") {
return(sum(metadata[i, prop_cols]))
} else {
expected_prop_col <- paste0(tolower(metadata$organism[i]), '_prop')
prop_cols <- prop_cols[! prop_cols %in% expected_prop_col]
return(sum(metadata[i, prop_cols]))
}
})
# Check if expected inoculum was found
metadata$expected_innoc <- map_lgl(1:nrow(metadata), function(i) {
all_innoc <- sum(metadata[i, c('cin_prop', 'cry_prop', 'plu_prop')])
if (metadata$sample_type[i] == "Mock community") {
return(NA)
} else if (metadata$sample_type[i] == "Negative control" || metadata$organism[i] == "Control") {
return(all_innoc == 0)
} else {
prop_col <- paste0(tolower(metadata$organism[i]), '_prop')
return(metadata[[i, prop_col]] > 0)
}
})
# Check if ONLY expected inoculum was found
metadata$only_expected_innoc <- map_lgl(1:nrow(metadata), function(i) {
all_innoc <- sum(metadata[i, c('cin_prop', 'cry_prop', 'plu_prop')])
if (metadata$sample_type[i] == "Mock community") {
return(NA)
} else if (metadata$sample_type[i] == "Negative control" || metadata$organism[i] == "Control") {
return(all_innoc == 0)
} else {
prop_col <- paste0(tolower(metadata$organism[i]), '_prop')
return(metadata[[i, prop_col]] > 0 && metadata[[i, prop_col]] == all_innoc)
}
})
# Look at number of samples with expected inoculum
table(metadata$expected_innoc)
##
## FALSE TRUE
## 88 82
table(metadata$only_expected_innoc)
##
## FALSE TRUE
## 107 63
metadata$valid_inoc <- (is.na(metadata$expected_innoc_prop) | metadata$expected_innoc_prop > 0.0001) & metadata$unexpected_innoc_prop < 0.0001
map_dbl(split(metadata$valid_inoc, metadata$organism), sum)
## Cin Control Cry Plu
## 26 9 16 8
# Save new metadata
write_csv(metadata, file.path('demulticoder/data', 'metadata_with_infection_data.csv'))
# Get per-sample proportions of reads
species_name_key <- c(Phytophthora_cinnamomi = "cin_prop", Phytophthora_plurivora = "plu_prop", Pythium_cryptoirregulare = "cry_prop")
sample_infection_data <- infection_data %>%
group_by(species) %>% summarise_at(metadata$sample_name, sum) %>% # Sum per inoculum species
tidyr::gather(key = "sample_name", value = "read_prop", !!! metadata$sample_name) %>%
mutate(species = species_name_key[species]) %>%
mutate(read_prop = ifelse(is.nan(read_prop), 0, read_prop)) %>%
tidyr::spread(key = 'species', value = 'read_prop')
metadata <- metadata %>%
select(sample_name, plate, well, organism, flooded, path_conc, experiment, sample_type, is_ambiguous) %>% # Avoid duplicating columns from previous runs
left_join(sample_infection_data, by = "sample_name")
# Get expected inoculum proportion and non-target proportion
metadata$expected_innoc_prop <- map_dbl(1:nrow(metadata), function(i) {
if (metadata$sample_type[i] == "Mock community") {
return(NA)
} else if (metadata$sample_type[i] == "Negative control" || metadata$organism[i] == "Control") {
return(NA)
} else {
prop_col <- paste0(tolower(metadata$organism[i]), '_prop')
return(metadata[[i, prop_col]])
}
})
metadata$unexpected_innoc_prop <- map_dbl(1:nrow(metadata), function(i) {
prop_cols <- c("cin_prop", "cry_prop", "plu_prop")
if (metadata$sample_type[i] == "Mock community" || metadata$sample_type[i] == "Negative control" || metadata$organism[i] == "Control") {
return(sum(metadata[i, prop_cols]))
} else {
expected_prop_col <- paste0(tolower(metadata$organism[i]), '_prop')
prop_cols <- prop_cols[! prop_cols %in% expected_prop_col]
return(sum(metadata[i, prop_cols]))
}
})
# Check if expected inoculum was found
metadata$expected_innoc <- map_lgl(1:nrow(metadata), function(i) {
all_innoc <- sum(metadata[i, c('cin_prop', 'cry_prop', 'plu_prop')])
if (metadata$sample_type[i] == "Mock community") {
return(NA)
} else if (metadata$sample_type[i] == "Negative control" || metadata$organism[i] == "Control") {
return(all_innoc == 0)
} else {
prop_col <- paste0(tolower(metadata$organism[i]), '_prop')
return(metadata[[i, prop_col]] > 0)
}
})
# Check if ONLY expected inoculum was found
metadata$only_expected_innoc <- map_lgl(1:nrow(metadata), function(i) {
all_innoc <- sum(metadata[i, c('cin_prop', 'cry_prop', 'plu_prop')])
if (metadata$sample_type[i] == "Mock community") {
return(NA)
} else if (metadata$sample_type[i] == "Negative control" || metadata$organism[i] == "Control") {
return(all_innoc == 0)
} else {
prop_col <- paste0(tolower(metadata$organism[i]), '_prop')
return(metadata[[i, prop_col]] > 0 && metadata[[i, prop_col]] == all_innoc)
}
})
# Look at number of samples with expected inoculum
table(metadata$expected_innoc)
##
## FALSE TRUE
## 88 82
table(metadata$only_expected_innoc)
##
## FALSE TRUE
## 107 63
metadata$valid_inoc <- (is.na(metadata$expected_innoc_prop) | metadata$expected_innoc_prop > 0.0001) & metadata$unexpected_innoc_prop < 0.0001
map_dbl(split(metadata$valid_inoc, metadata$organism), sum)
## Cin Control Cry Plu
## 26 9 16 8
# Save new metadata
write_csv(metadata, file.path('demulticoder/data', 'metadata_with_infection_data.csv'))
metadata <- filter(metadata, ! is_ambiguous)
abundance <- filter(abundance, ! is_inoculum)
#Filter out mock community samples and also any mock community samples as well
metadata <- filter(metadata, sample_type != "Mock community" & sample_type != "Negative control")
obj <- parse_tax_data(abundance, class_cols = 'dada2_tax', class_sep = ';',
class_regex = '^(.+)--(.+)--(.+)$',
class_key = c(taxon = 'taxon_name', boot = 'info', rank = 'taxon_rank'))
names(obj$data) <- c('abund', 'score')
obj <- transmute_obs(obj, 'score', sequence = sequence[input_index], boot = boot, rank = rank)
# For diversity calculations, we'll use proportions of read depth
# We'll set any proportion to 0 that is less than the inverse of the read count of the non-control sample with the fewest reads.
# This should account for unequal sample read depth without the randomness of rarefaction.
metadata$raw_count <- colSums(obj$data$abund[, metadata$sample_name])
lowest_count <- min(metadata$raw_count[! is.na(metadata$organism)])
lowest_count
## [1] 12676
obj$data$prop <- calc_obs_props(obj, data = 'abund', cols = metadata$sample_name)
## Calculating proportions from counts for 162 columns for 2707 observations.
obj$data$prop <- zero_low_counts(obj, data = 'prop', min_count = 1 / lowest_count, cols = metadata$sample_name)
## Zeroing 2166 of 438534 counts less than 7.88892395077311e-05.
obj$data$prop
## # A tibble: 2,707 × 163
## taxon_id S1 S10 S11 S12 S13 S14 S15 S16
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 bga 0.0821 0.101 0.0414 0.0552 0 0.0366 0.168 0
## 2 bgb 0.00867 0.0190 0.0643 0.00314 0.0220 0.00554 0.0162 2.47e-2
## 3 bgc 0.134 0 0 0 0 0.000293 0 1.67e-1
## 4 bgd 0.00482 0 0.0314 0.00815 0.00514 0 0 0
## 5 bge 0.00584 0.00589 0.00528 0.00252 0.0379 0.00453 0.00942 2.03e-2
## 6 bgf 0.000106 0.0137 0.00153 0.00339 0.0173 0.00664 0.00864 6.26e-2
## 7 bgg 0.000158 0.000228 0.000208 0.000121 0.00139 0 0 9.56e-4
## 8 bgh 0.00322 0.0271 0.00813 0.000482 0.00190 0.000601 0.0293 2.66e-3
## 9 bgc 0 0 0.188 0 0 0.0288 0 3.37e-3
## 10 bgc 0 0 0.183 0.0323 0.242 0 0.0507 2.32e-1
## # ℹ 2,697 more rows
## # ℹ 154 more variables: S17 <dbl>, S18 <dbl>, S2 <dbl>, S20 <dbl>, S21 <dbl>,
## # S22 <dbl>, S23 <dbl>, S24 <dbl>, S25 <dbl>, S26 <dbl>, S27 <dbl>,
## # S28 <dbl>, S29 <dbl>, S3 <dbl>, S31 <dbl>, S32 <dbl>, S33 <dbl>, S34 <dbl>,
## # S35 <dbl>, S36 <dbl>, S37 <dbl>, S38 <dbl>, S39 <dbl>, S4 <dbl>, S40 <dbl>,
## # S41 <dbl>, S42 <dbl>, S43 <dbl>, S45 <dbl>, S46 <dbl>, S47 <dbl>,
## # S48 <dbl>, S49 <dbl>, S5 <dbl>, S50 <dbl>, S51 <dbl>, S52 <dbl>, …
obj$data$prop[metadata$sample_name] <- map(metadata$sample_name, function(id) {
out <- obj$data$prop[[id]]
out[is.na(out) | is.nan(out)] <- 0
out
})
obj$data$prop
## # A tibble: 2,707 × 163
## taxon_id S1 S10 S11 S12 S13 S14 S15 S16
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 bga 0.0821 0.101 0.0414 0.0552 0 0.0366 0.168 0
## 2 bgb 0.00867 0.0190 0.0643 0.00314 0.0220 0.00554 0.0162 2.47e-2
## 3 bgc 0.134 0 0 0 0 0.000293 0 1.67e-1
## 4 bgd 0.00482 0 0.0314 0.00815 0.00514 0 0 0
## 5 bge 0.00584 0.00589 0.00528 0.00252 0.0379 0.00453 0.00942 2.03e-2
## 6 bgf 0.000106 0.0137 0.00153 0.00339 0.0173 0.00664 0.00864 6.26e-2
## 7 bgg 0.000158 0.000228 0.000208 0.000121 0.00139 0 0 9.56e-4
## 8 bgh 0.00322 0.0271 0.00813 0.000482 0.00190 0.000601 0.0293 2.66e-3
## 9 bgc 0 0 0.188 0 0 0.0288 0 3.37e-3
## 10 bgc 0 0 0.183 0.0323 0.242 0 0.0507 2.32e-1
## # ℹ 2,697 more rows
## # ℹ 154 more variables: S17 <dbl>, S18 <dbl>, S2 <dbl>, S20 <dbl>, S21 <dbl>,
## # S22 <dbl>, S23 <dbl>, S24 <dbl>, S25 <dbl>, S26 <dbl>, S27 <dbl>,
## # S28 <dbl>, S29 <dbl>, S3 <dbl>, S31 <dbl>, S32 <dbl>, S33 <dbl>, S34 <dbl>,
## # S35 <dbl>, S36 <dbl>, S37 <dbl>, S38 <dbl>, S39 <dbl>, S4 <dbl>, S40 <dbl>,
## # S41 <dbl>, S42 <dbl>, S43 <dbl>, S45 <dbl>, S46 <dbl>, S47 <dbl>,
## # S48 <dbl>, S49 <dbl>, S5 <dbl>, S50 <dbl>, S51 <dbl>, S52 <dbl>, …
#Let's modify the metadata sheet
metadata <- metadata %>%
mutate(organism = fct_relevel(ordered(organism), "Control", "Cin", "Plu", "Cry"))
abund_table <- obj$data$abund[metadata$sample_name]
metadata$richness <- vegan::specnumber(abund_table, MARGIN = 2)
metadata$shannon <- vegan::diversity(abund_table, MARGIN = 2, index = "shannon")
metadata$invsimpson <- vegan::diversity(abund_table, MARGIN = 2, index = "invsimpson")
write.csv(metadata, file.path("demulticoder", "results/alpha_diversity.csv"))
print(metadata)
## # A tibble: 162 × 21
## sample_name plate well organism flooded path_conc experiment sample_type
## <chr> <dbl> <chr> <ord> <lgl> <dbl> <dbl> <chr>
## 1 S1 1 A01 Cry TRUE 100 1 Sample
## 2 S10 1 B02 Cry TRUE 1 1 Sample
## 3 S11 1 C02 Plu FALSE 1 1 Sample
## 4 S12 1 D02 Plu TRUE 1 1 Sample
## 5 S13 1 E02 Control TRUE 0 1 Sample
## 6 S14 1 F02 Cin TRUE 1 1 Sample
## 7 S15 1 H02 Cin FALSE 100 1 Sample
## 8 S16 1 A03 Cry FALSE 100 1 Sample
## 9 S17 1 B03 Cry TRUE 100 1 Sample
## 10 S18 1 C03 Control TRUE 0 1 Sample
## # ℹ 152 more rows
## # ℹ 13 more variables: is_ambiguous <lgl>, cin_prop <dbl>, cry_prop <dbl>,
## # plu_prop <dbl>, expected_innoc_prop <dbl>, unexpected_innoc_prop <dbl>,
## # expected_innoc <lgl>, only_expected_innoc <lgl>, valid_inoc <lgl>,
## # raw_count <dbl>, richness <int>, shannon <dbl>, invsimpson <dbl>
plotted_factors <- c('Organism' = 'organism', 'Flooded' = 'flooded', 'Pathogen Concentration' = 'path_conc', 'Trial' = 'experiment')
# Reformat data for plotting
alpha_plot_data <- plotted_factors %>%
map2_dfr(names(plotted_factors),
function(factor, factor_name) {
out <- metadata
out$factor <- factor_name
out$value <- as.character(metadata[[factor]])
return(out)
}) %>%
mutate(path_conc = factor(path_conc,
levels = sort(unique(path_conc)),
labels = paste(sort(unique(path_conc)), 'CFU/g'),
ordered = TRUE)) %>%
filter(sample_type == 'Sample') %>%
select(sample_name, factor, value, invsimpson) %>%
tidyr::gather(key = "index", value = "diversity", -sample_name, -factor, -value) %>%
mutate(value = forcats::fct_relevel(ordered(value), "Control", "Cin", "Plu", "Cry"))
# ANOVA and Tukey's HSD
anova_and_hsd <- function(x) {
anova_result <- aov(diversity ~ value, x)
tukey_result <- agricolae::HSD.test(anova_result, "value", group = TRUE)
group_data <- tukey_result$groups[order(rownames(tukey_result$groups)),]
group_key <- setNames(group_data$groups, rownames(group_data))
group_key[as.character(x$value)]
}
alpha_plot_data$group <- unlist(map(split(alpha_plot_data, alpha_plot_data$factor)[unique(alpha_plot_data$factor)], anova_and_hsd))
alpha_subplot <- ggplot(alpha_plot_data, aes(x = value, y = diversity)) +
geom_boxplot() +
geom_text(aes(x = value,
y = max(diversity) + 2,
label = group),
col = 'black',
size = 5) +
facet_grid( ~ factor, scales = "free") +
labs(x = NULL, y = 'Diversity (Inverse Simpson)') +
guides(color = "none") +
theme(panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position="bottom")
library(cowplot)
set.seed(1)
prob_table <- obj$data$prop[metadata$sample_name]
nmds_plot_data <- function(prob_table) {
metadata <- metadata[metadata$sample_name %in% colnames(prob_table), ]
set.seed(1)
nmds_results <- vegan::metaMDS(t(prob_table), trymax = 1000, k = 2, trace = 0)
nmds_data <- nmds_results$points %>%
as_tibble() %>%
bind_cols(metadata)
names(nmds_data)[1:2] <- paste0("NMDS", 1:2)
return(nmds_data)
}
nmds_data <- nmds_plot_data(prob_table[! is.na(metadata$organism) & metadata$valid_inoc])
nmds_factors <- c(Flooded = 'flooded', Organism = 'organism', 'Pathogen CFU/g' = 'path_conc', 'Trial' = 'experiment')
make_one_plot <- function(factor, name) {
nmds_data %>%
mutate(factor = as.character(nmds_data[[factor]]),
NMDS1 = scales::rescale(NMDS1),
NMDS2 = scales::rescale(NMDS2)) %>%
mutate(factor = fct_relevel(ordered(factor), "Control", "Cin", "Plu", "Cry")) %>%
ggplot(aes_string(x = "NMDS1", y = "NMDS2", color = "factor", label = "sample_name")) +
geom_point() +
coord_fixed() +
viridis::scale_color_viridis(discrete = TRUE, end = .9) +
labs(color = NULL, x = "NMDS1", y = "NMDS2") +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.text = element_text(size = 10),
axis.title = element_text(size = 12),
axis.ticks = element_line(color = "black"),
plot.margin = unit(rep(0.04, 4), "cm"),
legend.position = "bottom",
legend.text = element_text(size = 10),
legend.key.height = unit(0.5, 'cm'),
legend.key.width = unit(0.2, 'cm'))
}
nmds_subplots <- map2(nmds_factors, names(nmds_factors), make_one_plot)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `factor = fct_relevel(ordered(factor), "Control", "Cin", "Plu",
## "Cry")`.
## Caused by warning:
## ! 4 unknown levels in `f`: Control, Cin, Plu, and Cry
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `factor = fct_relevel(ordered(factor), "Control", "Cin", "Plu",
## "Cry")`.
## Caused by warning:
## ! 4 unknown levels in `f`: Control, Cin, Plu, and Cry
## There was 1 warning in `mutate()`.
## ℹ In argument: `factor = fct_relevel(ordered(factor), "Control", "Cin", "Plu",
## "Cry")`.
## Caused by warning:
## ! 4 unknown levels in `f`: Control, Cin, Plu, and Cry
nmds_plot <- ggpubr::ggarrange(plotlist = c(list(ggplot() + theme_void()), nmds_subplots),
nrow = 1, widths = c(0.15, 1, 1, 1, 1))
ggsave(nmds_plot, path = "demulticoder/figures", filename = "nmds.pdf",
width = 7, height = 8)
print(nmds_plot)
combined_div_plot <- plot_grid(alpha_subplot, nmds_plot, ncol = 1, labels = c('A', 'B'),
rel_heights = c(1, 0.9))
combined_div_plot
ggsave(combined_div_plot, path = "demulticoder/figures", filename = "diversity.pdf",
width = 13, height = 8, bg = "#FFFFFF")
ggsave(combined_div_plot, path = "demulticoder/figures", filename = "diversity.svg",
width = 13, height = 8, bg = "#FFFFFF")
### Export coordinates
write.csv(nmds_data, file.path("demulticoder", "results/compiled_stats.csv"))
dist_matrix <- vegdist(t(prob_table), method = "bray")
adonis2_full <- adonis2(dist_matrix ~ organism + flooded + path_conc + experiment,
data = metadata,
permutations = 999,
method = "bray")
print(adonis2_full)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = dist_matrix ~ organism + flooded + path_conc + experiment, data = metadata, permutations = 999, method = "bray")
## Df SumOfSqs R2 F Pr(>F)
## organism 3 1.254 0.02384 1.3900 0.048 *
## flooded 1 1.848 0.03514 6.1469 0.001 ***
## path_conc 1 0.276 0.00524 0.9165 0.545
## experiment 1 2.614 0.04971 8.6953 0.001 ***
## Residual 155 46.598 0.88608
## Total 161 52.590 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
adonis2_org <- adonis2(dist_matrix ~ organism,
data = metadata,
permutations = 999,
method = "bray")
print(adonis2_org)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = dist_matrix ~ organism, data = metadata, permutations = 999, method = "bray")
## Df SumOfSqs R2 F Pr(>F)
## organism 3 1.254 0.02384 1.2861 0.101
## Residual 158 51.336 0.97616
## Total 161 52.590 1.00000
adonis2_flooded <- adonis2(dist_matrix ~ flooded,
data = metadata,
permutations = 999,
method = "bray")
print(adonis2_flooded)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = dist_matrix ~ flooded, data = metadata, permutations = 999, method = "bray")
## Df SumOfSqs R2 F Pr(>F)
## flooded 1 1.849 0.03515 5.8294 0.001 ***
## Residual 160 50.741 0.96485
## Total 161 52.590 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
adonis2_path_conc <- adonis2(dist_matrix ~ path_conc,
data = metadata,
permutations = 999,
method = "bray")
print(adonis2_path_conc)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = dist_matrix ~ path_conc, data = metadata, permutations = 999, method = "bray")
## Df SumOfSqs R2 F Pr(>F)
## path_conc 1 0.308 0.00586 0.9431 0.466
## Residual 160 52.281 0.99414
## Total 161 52.590 1.00000
adonis2_experiment <- adonis2(dist_matrix ~ experiment,
data = metadata,
permutations = 999,
method = "bray")
print(adonis2_experiment)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = dist_matrix ~ experiment, data = metadata, permutations = 999, method = "bray")
## Df SumOfSqs R2 F Pr(>F)
## experiment 1 2.613 0.04969 8.3655 0.001 ***
## Residual 160 49.977 0.95031
## Total 161 52.590 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Make sure our factors are input properly as factors
metadata$path_conc <- as.factor(metadata$path_conc)
metadata$path_conc<- factor(metadata$path_conc, levels = c("0", "1", "100"))
metadata$organism <- as.factor(metadata$organism)
metadata$organism<- factor(metadata$organism, levels = c("Control", "Cin", "Cry", "Plu"))
metadata$flooded <- as.factor(metadata$flooded)
metadata$flooded <- factor(metadata$flooded, levels = c("TRUE", "FALSE"))
metadata$experiment <- as.factor(metadata$experiment)
metadata$experiment<- factor(metadata$experiment, levels = c("1", "2"))
abundance$is_low_abund <- rowSums(abundance[, metadata$sample_name]) < 5
obj2 <- parse_tax_data(abundance, class_cols = 'dada2_tax', class_sep = ';',
class_regex = '^(.+)--(.+)--(.+)$',
class_key = c(taxon = 'taxon_name', boot = 'info', rank = 'taxon_rank'))
names(obj2$data) <- c('abund', 'score')
obj2 <- transmute_obs(obj2, 'score', sequence = sequence[input_index], boot = boot, rank = rank)
print(obj2)
## <Taxmap>
## 1575 taxa: aab. Eukaryota, aac. Fungi ... cip. Fungi_sp
## 1575 edges: NA->aab, aab->aac, aab->aad ... cin->cio, cio->cip
## 2 data sets:
## abund:
## # A tibble: 2,707 × 180
## taxon_id sequence dada2_tax is_inoculum inoculum S1 S10
## <chr> <chr> <chr> <lgl> <chr> <int> <int>
## 1 bga AAGTCGTAA… Eukaryot… FALSE <NA> 4670 3565
## 2 bgb AAGTCGTAA… Eukaryot… FALSE <NA> 493 668
## 3 bgc AAAAAGTCG… Eukaryot… FALSE <NA> 7603 0
## # ℹ 2,704 more rows
## # ℹ 173 more variables: S100 <int>, S101 <int>, S102 <int>,
## # S103 <int>, S104 <int>, S105 <int>, S106 <int>, S107 <int>,
## # S108 <int>, S109 <int>, …
## score:
## # A tibble: 23,852 × 4
## taxon_id sequence boot rank
## <chr> <chr> <chr> <chr>
## 1 aab AAGTCGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGA… 100 Doma…
## 2 aac AAGTCGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGA… 100 King…
## 3 aae AAGTCGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGA… 73 Phyl…
## # ℹ 23,849 more rows
## 0 functions:
obj2$data$tax_abund <- calc_taxon_abund(obj2, data = "abund", cols = metadata$sample_name)
## Summing per-taxon counts from 162 columns for 1575 taxa
obj2$data$abund_prop <- calc_obs_props(obj2, data = "abund", cols = metadata$sample_name, groups = rep("tax_prop", nrow(metadata)))
## Calculating proportions from counts for 162 columns in 1 groups for 2707 observations.
obj2$data$tax_prop <- calc_taxon_abund(obj2, data = "abund_prop", cols = "tax_prop")
## Summing per-taxon counts from 1 columns for 1575 taxa
obj2$data$abund_prop <- NULL
obj_subset <- obj2 %>%
filter_taxa(taxon_ranks == "Species", supertaxa = TRUE, reassign_obs = c(tax_abund = FALSE))
min_bootstrap <- 0
obj_subset$data$score$boot <- as.numeric(obj_subset$data$score$boot)
max_boot <- obj_subset$data$score %>%
group_by(taxon_id) %>%
summarise(max = max(boot))
max_boot <- setNames(max_boot$max, max_boot$taxon_id)
obj_subset <- filter_taxa(obj_subset, max_boot[taxon_ids] >= min_bootstrap | taxon_ranks %in% c("ASV", "Reference"), reassign_obs = c(abund = TRUE, score = FALSE))
## [1] 7.253048e-18 9.976959e-01
## [1] -24.788261 5.134441
obj_subset <- obj2 %>%
filter_taxa(taxon_ranks == "Species", supertaxa = TRUE, reassign_obs = c(tax_abund = FALSE))
min_bootstrap <- 60
obj_subset$data$score$boot <- as.numeric(obj_subset$data$score$boot)
max_boot <- obj_subset$data$score %>%
group_by(taxon_id) %>%
summarise(max = max(boot))
max_boot <- setNames(max_boot$max, max_boot$taxon_id)
obj_subset <- filter_taxa(obj_subset, max_boot[taxon_ids] >= min_bootstrap | taxon_ranks %in% c("ASV", "Reference"), reassign_obs = c(abund = TRUE, score = FALSE))
obj_subset$data$asv_prop <- calc_obs_props(obj_subset, 'abund', cols = metadata$sample_name)
## Calculating proportions from counts for 162 columns for 2707 observations.
obj_subset$data$tax_abund <- calc_taxon_abund(obj_subset, 'abund', cols = metadata$sample_name)
## Summing per-taxon counts from 162 columns for 1125 taxa
obj_subset$data$tax_prop <- calc_taxon_abund(obj_subset, 'asv_prop', cols = metadata$sample_name)
## Summing per-taxon counts from 162 columns for 1125 taxa
obj_subset$data$tax_data <- calc_n_samples(obj_subset, 'tax_prop', cols = metadata$sample_name[metadata$sample_type == 'Sample'])
## Calculating number of samples with a value greater than 0 for 162 columns for 1125 observations
obj_subset$data$tax_data$mean_prop <- rowMeans(obj_subset$data$tax_prop[, metadata$sample_name])
# Calculate the relative standard deviation for each taxon as a measure of how consistently it was found.
rsd <- function(x, na.rm = FALSE) {sd(x, na.rm = na.rm) / mean(x, na.rm = na.rm)}
obj_subset$data$tax_data$rel_stand_dev <- map_dbl(1:nrow(obj_subset$data$tax_abund), function(i) {
rsd(unlist(obj_subset$data$tax_abund[i, metadata$sample_name]), na.rm = TRUE)
})
obj_subset
## <Taxmap>
## 1125 taxa: aab. Eukaryota, aac. Fungi ... cip. Fungi_sp
## 1125 edges: NA->aab, aab->aac, aab->aad ... cin->cio, cio->cip
## 6 data sets:
## abund:
## # A tibble: 2,707 × 180
## taxon_id sequence dada2_tax is_inoculum inoculum S1 S10
## <chr> <chr> <chr> <lgl> <chr> <int> <int>
## 1 acm AAGTCGTAA… Eukaryot… FALSE <NA> 4670 3565
## 2 bgb AAGTCGTAA… Eukaryot… FALSE <NA> 493 668
## 3 bgc AAAAAGTCG… Eukaryot… FALSE <NA> 7603 0
## # ℹ 2,704 more rows
## # ℹ 173 more variables: S100 <int>, S101 <int>, S102 <int>,
## # S103 <int>, S104 <int>, S105 <int>, S106 <int>, S107 <int>,
## # S108 <int>, S109 <int>, …
## score:
## # A tibble: 22,551 × 4
## taxon_id sequence boot rank
## <chr> <chr> <dbl> <chr>
## 1 aab AAGTCGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGA… 100 Doma…
## 2 aac AAGTCGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGA… 100 King…
## 3 aae AAGTCGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGA… 73 Phyl…
## # ℹ 22,548 more rows
## tax_abund:
## # A tibble: 1,125 × 163
## taxon_id S1 S10 S11 S12 S13 S14 S15 S16 S17
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 aab 56860 35128 28797 58035 46889 68269 16554 24056 49958
## 2 aac 27729 12225 22562 10819 27587 10124 7579 22610 23009
## 3 aad 29131 22903 6235 47216 19302 58145 8975 1446 26949
## # ℹ 1,122 more rows
## # ℹ 153 more variables: S18 <dbl>, S2 <dbl>, S20 <dbl>,
## # S21 <dbl>, S22 <dbl>, S23 <dbl>, S24 <dbl>, S25 <dbl>,
## # S26 <dbl>, S27 <dbl>, …
## tax_prop:
## # A tibble: 1,125 × 163
## taxon_id S1 S10 S11 S12 S13 S14 S15 S16
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 aab 1 1 1 1 1.00 1 1 1
## 2 aac 0.488 0.348 0.783 0.186 0.588 0.148 0.458 0.940
## 3 aad 0.512 0.652 0.217 0.814 0.412 0.852 0.542 0.0601
## # ℹ 1,122 more rows
## # ℹ 154 more variables: S17 <dbl>, S18 <dbl>, S2 <dbl>,
## # S20 <dbl>, S21 <dbl>, S22 <dbl>, S23 <dbl>, S24 <dbl>,
## # S25 <dbl>, S26 <dbl>, …
## asv_prop:
## # A tibble: 2,707 × 163
## taxon_id S1 S10 S11 S12 S13 S14 S15
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 acm 0.0821 0.101 0.0414 0.0552 0 0.0366 0.168
## 2 bgb 0.00867 0.0190 0.0643 0.00314 0.0220 0.00554 0.0162
## 3 bgc 0.134 0 0 0 0 0.000293 0
## # ℹ 2,704 more rows
## # ℹ 155 more variables: S16 <dbl>, S17 <dbl>, S18 <dbl>,
## # S2 <dbl>, S20 <dbl>, S21 <dbl>, S22 <dbl>, S23 <dbl>,
## # S24 <dbl>, S25 <dbl>, …
## tax_data:
## # A tibble: 1,125 × 4
## taxon_id n_samples mean_prop rel_stand_dev
## <chr> <int> <dbl> <dbl>
## 1 aab 162 1 0.443
## 2 aac 162 0.416 0.652
## 3 aad 162 0.584 0.711
## # ℹ 1,122 more rows
## 0 functions:
set.seed(1000)
obj_subset %>%
filter_taxa(! is_stem) %>%
filter_taxa(n_samples >= 10, supertaxa = TRUE, reassign_obs = FALSE) %>%
filter_taxa(! grepl(x = taxon_names, "_sp$"), reassign_obs = FALSE) %>%
filter_taxa(! grepl(x = taxon_names, "incertae_sedis", ignore.case = TRUE), reassign_obs = FALSE) %>%
remove_redundant_names() %>%
heat_tree(node_size = mean_prop,
edge_size = n_samples,
node_color = ifelse(is.na(rel_stand_dev), 0, rel_stand_dev),
#node_color_range = c("#aaaaaa", "#8da0cb", "#66c2a5", "#a6d854", "#fc8d62", "red"),
node_label = taxon_names,
node_size_range = c(0.008, 0.025),
node_label_size_range = c(0.012, 0.018),
edge_label_size_range = c(0.010, 0.013),
node_size_interval = c(0, 1),
edge_size_range = c(0.001, 0.008),
#layout = "da", initial_layout = "re",
node_color_axis_label = "Relative standard deviation",
node_size_axis_label = "Mean proportion of reads",
edge_size_axis_label = "Number of samples",
node_color_digits = 2,
node_size_digits = 2,
edge_color_digits = 2,
edge_size_digits = 2,
#aspect_ratio = 1.618,
output_file = file.path('demulticoder/figures', '/heattree_mostabund_taxa.pdf'))
asv_matrix_rps10<-read.csv("demulticoder/data/final_asv_abundance_matrix_rps10.csv")
asv_matrix_rps10$dada2_tax <- asv_matrix_rps10$dada2_tax <- gsub("Stramenopila", "Eukaryota--100--Domain;Stramenopila", asv_matrix_rps10$dada2_tax)
asv_matrix_rps10 <- asv_matrix_rps10[, -1]
colnames(asv_matrix_rps10)[3:ncol(asv_matrix_rps10)] <- gsub("_rps10$", "", colnames(asv_matrix_rps10)[3:ncol(asv_matrix_rps10)])
asv_matrix_rps10_oldb<-read.csv("demulticoder/data_olddb//final_asv_abundance_matrix_rps10.csv")
asv_matrix_rps10_oldb$dada2_tax <- asv_matrix_rps10_oldb$dada2_tax <- gsub("Stramenopila", "Eukaryota--100--Domain;Stramenopila", asv_matrix_rps10_oldb$dada2_tax)
asv_matrix_rps10_oldb <- asv_matrix_rps10_oldb[, -1]
colnames(asv_matrix_rps10_oldb)[3:ncol(asv_matrix_rps10_oldb)] <- gsub("_rps10$", "", colnames(asv_matrix_rps10_oldb)[3:ncol(asv_matrix_rps10_oldb)])
metadata_path <- file.path("demulticoder/data/metadata.csv")
metadata <- read_csv(metadata_path)
## Rows: 174 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): sample_name, well, organism, sample_type
## dbl (3): plate, path_conc, experiment
## lgl (2): flooded, is_ambiguous
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
print(metadata)
## # A tibble: 174 × 9
## sample_name plate well organism flooded path_conc experiment sample_type
## <chr> <dbl> <chr> <chr> <lgl> <dbl> <dbl> <chr>
## 1 S1 1 A01 Cry TRUE 100 1 Sample
## 2 S10 1 B02 Cry TRUE 1 1 Sample
## 3 S11 1 C02 Plu FALSE 1 1 Sample
## 4 S12 1 D02 Plu TRUE 1 1 Sample
## 5 S13 1 E02 Control TRUE 0 1 Sample
## 6 S14 1 F02 Cin TRUE 1 1 Sample
## 7 S15 1 H02 Cin FALSE 100 1 Sample
## 8 S16 1 A03 Cry FALSE 100 1 Sample
## 9 S17 1 B03 Cry TRUE 100 1 Sample
## 10 S18 1 C03 Control TRUE 0 1 Sample
## # ℹ 164 more rows
## # ℹ 1 more variable: is_ambiguous <lgl>
obj_newdb <- parse_tax_data(asv_matrix_rps10, class_cols = 'dada2_tax', class_sep = ';',
class_regex = '^(.+)--(.+)--(.+)$',
class_key = c(taxon = 'taxon_name', boot = 'info', rank = 'taxon_rank'))
names(obj_newdb$data) <- c('abund', 'score')
obj_newdb <- transmute_obs(obj_newdb, 'score', sequence = sequence[input_index], boot = boot, rank = rank)
# For diversity calculations, we'll use proportions of read depth
# We'll set any proportion to 0 that is less than the inverse of the read count of the non-control sample with the fewest reads.
# This should account for unequal sample read depth without the randomness of rarefaction.
metadata$raw_count <- colSums(obj_newdb$data$abund[, metadata$sample_name])
lowest_count <- min(metadata$raw_count[! is.na(metadata$organism)])
lowest_count
## [1] 251
obj_newdb$data$prop <- calc_obs_props(obj_newdb, data = 'abund', cols = metadata$sample_name)
## Calculating proportions from counts for 174 columns for 347 observations.
obj_newdb$data$prop <- zero_low_counts(obj_newdb, data = 'prop', min_count = 1 / lowest_count, cols = metadata$sample_name)
## Zeroing 507 of 60378 counts less than 0.00398406374501992.
obj_newdb$data$prop
## # A tibble: 347 × 175
## taxon_id S1 S10 S11 S12 S13 S14 S15 S16 S17 S18
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 bm 0.409 0.0490 0.912 0 0.589 0 0 0.0384 0.704 0.660
## 2 bn 0 0 0 0 0 0 0.716 0 0 0
## 3 bo 0 0 0 0.0101 0.0194 0.550 0 0 0 0.0126
## 4 bp 0 0 0 0 0 0 0 0 0 0
## 5 bq 0 0 0 0 0 0 0 0 0 0
## 6 br 0.568 0.131 0 0.0530 0.381 0.0148 0 0 0.0917 0.0660
## 7 br 0 0 0 0 0 0.0247 0 0 0 0
## 8 bs 0 0.818 0 0.774 0 0.391 0.265 0 0 0
## 9 bp 0 0 0 0.0646 0 0.0156 0 0 0 0
## 10 bt 0.00624 0 0 0.0416 0 0 0 0.935 0.182 0.209
## # ℹ 337 more rows
## # ℹ 164 more variables: S2 <dbl>, S20 <dbl>, S21 <dbl>, S22 <dbl>, S23 <dbl>,
## # S24 <dbl>, S25 <dbl>, S26 <dbl>, S27 <dbl>, S28 <dbl>, S29 <dbl>, S3 <dbl>,
## # S31 <dbl>, S32 <dbl>, S33 <dbl>, S34 <dbl>, S35 <dbl>, S36 <dbl>,
## # S37 <dbl>, S38 <dbl>, S39 <dbl>, S4 <dbl>, S40 <dbl>, S41 <dbl>, S42 <dbl>,
## # S43 <dbl>, S45 <dbl>, S46 <dbl>, S47 <dbl>, S48 <dbl>, S49 <dbl>, S5 <dbl>,
## # S50 <dbl>, S51 <dbl>, S52 <dbl>, S53 <dbl>, S54 <dbl>, S55 <dbl>, …
obj_newdb$data$prop[metadata$sample_name] <- map(metadata$sample_name, function(id) {
out <- obj_newdb$data$prop[[id]]
out[is.na(out) | is.nan(out)] <- 0
out
})
obj_newdb$data$prop
## # A tibble: 347 × 175
## taxon_id S1 S10 S11 S12 S13 S14 S15 S16 S17 S18
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 bm 0.409 0.0490 0.912 0 0.589 0 0 0.0384 0.704 0.660
## 2 bn 0 0 0 0 0 0 0.716 0 0 0
## 3 bo 0 0 0 0.0101 0.0194 0.550 0 0 0 0.0126
## 4 bp 0 0 0 0 0 0 0 0 0 0
## 5 bq 0 0 0 0 0 0 0 0 0 0
## 6 br 0.568 0.131 0 0.0530 0.381 0.0148 0 0 0.0917 0.0660
## 7 br 0 0 0 0 0 0.0247 0 0 0 0
## 8 bs 0 0.818 0 0.774 0 0.391 0.265 0 0 0
## 9 bp 0 0 0 0.0646 0 0.0156 0 0 0 0
## 10 bt 0.00624 0 0 0.0416 0 0 0 0.935 0.182 0.209
## # ℹ 337 more rows
## # ℹ 164 more variables: S2 <dbl>, S20 <dbl>, S21 <dbl>, S22 <dbl>, S23 <dbl>,
## # S24 <dbl>, S25 <dbl>, S26 <dbl>, S27 <dbl>, S28 <dbl>, S29 <dbl>, S3 <dbl>,
## # S31 <dbl>, S32 <dbl>, S33 <dbl>, S34 <dbl>, S35 <dbl>, S36 <dbl>,
## # S37 <dbl>, S38 <dbl>, S39 <dbl>, S4 <dbl>, S40 <dbl>, S41 <dbl>, S42 <dbl>,
## # S43 <dbl>, S45 <dbl>, S46 <dbl>, S47 <dbl>, S48 <dbl>, S49 <dbl>, S5 <dbl>,
## # S50 <dbl>, S51 <dbl>, S52 <dbl>, S53 <dbl>, S54 <dbl>, S55 <dbl>, …
obj_olddb <- parse_tax_data(asv_matrix_rps10_oldb, class_cols = 'dada2_tax', class_sep = ';',
class_regex = '^(.+)--(.+)--(.+)$',
class_key = c(taxon = 'taxon_name', boot = 'info', rank = 'taxon_rank'))
names(obj_olddb$data) <- c('abund', 'score')
obj_olddb <- transmute_obs(obj_olddb, 'score', sequence = sequence[input_index], boot = boot, rank = rank)
# For diversity calculations, we'll use proportions of read depth
# We'll set any proportion to 0 that is less than the inverse of the read count of the non-control sample with the fewest reads.
# This should account for unequal sample read depth without the randomness of rarefaction.
metadata$raw_count <- colSums(obj_olddb$data$abund[, metadata$sample_name])
lowest_count <- min(metadata$raw_count[! is.na(metadata$organism)])
lowest_count
## [1] 251
obj_olddb$data$prop <- calc_obs_props(obj_olddb, data = 'abund', cols = metadata$sample_name)
## Calculating proportions from counts for 174 columns for 347 observations.
obj_olddb$data$prop <- zero_low_counts(obj_olddb, data = 'prop', min_count = 1 / lowest_count, cols = metadata$sample_name)
## Zeroing 507 of 60378 counts less than 0.00398406374501992.
obj_olddb$data$prop
## # A tibble: 347 × 175
## taxon_id S1 S10 S11 S12 S13 S14 S15 S16 S17 S18
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 bl 0.409 0.0490 0.912 0 0.589 0 0 0.0384 0.704 0.660
## 2 bm 0 0 0 0 0 0 0.716 0 0 0
## 3 bn 0 0 0 0.0101 0.0194 0.550 0 0 0 0.0126
## 4 bl 0 0 0 0 0 0 0 0 0 0
## 5 bo 0 0 0 0 0 0 0 0 0 0
## 6 bp 0.568 0.131 0 0.0530 0.381 0.0148 0 0 0.0917 0.0660
## 7 bq 0 0 0 0 0 0.0247 0 0 0 0
## 8 br 0 0.818 0 0.774 0 0.391 0.265 0 0 0
## 9 bl 0 0 0 0.0646 0 0.0156 0 0 0 0
## 10 bs 0.00624 0 0 0.0416 0 0 0 0.935 0.182 0.209
## # ℹ 337 more rows
## # ℹ 164 more variables: S2 <dbl>, S20 <dbl>, S21 <dbl>, S22 <dbl>, S23 <dbl>,
## # S24 <dbl>, S25 <dbl>, S26 <dbl>, S27 <dbl>, S28 <dbl>, S29 <dbl>, S3 <dbl>,
## # S31 <dbl>, S32 <dbl>, S33 <dbl>, S34 <dbl>, S35 <dbl>, S36 <dbl>,
## # S37 <dbl>, S38 <dbl>, S39 <dbl>, S4 <dbl>, S40 <dbl>, S41 <dbl>, S42 <dbl>,
## # S43 <dbl>, S45 <dbl>, S46 <dbl>, S47 <dbl>, S48 <dbl>, S49 <dbl>, S5 <dbl>,
## # S50 <dbl>, S51 <dbl>, S52 <dbl>, S53 <dbl>, S54 <dbl>, S55 <dbl>, …
obj_olddb$data$prop[metadata$sample_name] <- map(metadata$sample_name, function(id) {
out <- obj_olddb$data$prop[[id]]
out[is.na(out) | is.nan(out)] <- 0
out
})
obj_olddb$data$prop
## # A tibble: 347 × 175
## taxon_id S1 S10 S11 S12 S13 S14 S15 S16 S17 S18
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 bl 0.409 0.0490 0.912 0 0.589 0 0 0.0384 0.704 0.660
## 2 bm 0 0 0 0 0 0 0.716 0 0 0
## 3 bn 0 0 0 0.0101 0.0194 0.550 0 0 0 0.0126
## 4 bl 0 0 0 0 0 0 0 0 0 0
## 5 bo 0 0 0 0 0 0 0 0 0 0
## 6 bp 0.568 0.131 0 0.0530 0.381 0.0148 0 0 0.0917 0.0660
## 7 bq 0 0 0 0 0 0.0247 0 0 0 0
## 8 br 0 0.818 0 0.774 0 0.391 0.265 0 0 0
## 9 bl 0 0 0 0.0646 0 0.0156 0 0 0 0
## 10 bs 0.00624 0 0 0.0416 0 0 0 0.935 0.182 0.209
## # ℹ 337 more rows
## # ℹ 164 more variables: S2 <dbl>, S20 <dbl>, S21 <dbl>, S22 <dbl>, S23 <dbl>,
## # S24 <dbl>, S25 <dbl>, S26 <dbl>, S27 <dbl>, S28 <dbl>, S29 <dbl>, S3 <dbl>,
## # S31 <dbl>, S32 <dbl>, S33 <dbl>, S34 <dbl>, S35 <dbl>, S36 <dbl>,
## # S37 <dbl>, S38 <dbl>, S39 <dbl>, S4 <dbl>, S40 <dbl>, S41 <dbl>, S42 <dbl>,
## # S43 <dbl>, S45 <dbl>, S46 <dbl>, S47 <dbl>, S48 <dbl>, S49 <dbl>, S5 <dbl>,
## # S50 <dbl>, S51 <dbl>, S52 <dbl>, S53 <dbl>, S54 <dbl>, S55 <dbl>, …
min_bootstrap <- 0
obj_newdb$data$score$boot <- as.numeric(obj_newdb$data$score$boot)
max_boot <- obj_newdb$data$score %>%
group_by(taxon_id) %>%
summarise(max = max(boot))
max_boot <- setNames(max_boot$max, max_boot$taxon_id)
obj_newdb <- filter_taxa(obj_newdb, max_boot[taxon_ids] >= min_bootstrap | taxon_ranks %in% c("ASV", "Reference"), reassign_obs = c(abund = TRUE, score = FALSE))
obj_newdb$data$asv_prop <- calc_obs_props(obj_newdb, 'abund', cols = metadata$sample_name)
## Calculating proportions from counts for 174 columns for 347 observations.
obj_newdb$data$tax_abund <- calc_taxon_abund(obj_newdb, 'abund', cols = metadata$sample_name)
## Summing per-taxon counts from 174 columns for 143 taxa
obj_newdb$data$tax_prop <- calc_taxon_abund(obj_newdb, 'asv_prop', cols = metadata$sample_name)
## Summing per-taxon counts from 174 columns for 143 taxa
obj_newdb$data$tax_data <- calc_n_samples(obj_newdb, 'tax_prop', cols = metadata$sample_name[metadata$sample_type == 'Sample'])
## Calculating number of samples with a value greater than 0 for 162 columns for 143 observations
obj_newdb$data$tax_data$mean_prop <- rowMeans(obj_newdb$data$tax_prop[, metadata$sample_name])
obj_newdb %>%
filter_taxa(taxon_ranks == "Species", supertaxa = TRUE, reassign_obs = c(tax_abund = FALSE))
## <Taxmap>
## 143 taxa: ab. Eukaryota ... fn. Phytopythium_ostracodes
## 143 edges: NA->ab, ab->ac, ac->ad ... av->fl, ba->fm, aw->fn
## 7 data sets:
## abund:
## # A tibble: 347 × 177
## taxon_id sequence dada2_tax S1 S10 S100 S101 S102
## <chr> <chr> <chr> <int> <int> <int> <int> <int>
## 1 bm GAAAATCTTTGT… Eukaryot… 11976 1123 0 0 1125
## 2 bn GAAAATCTTTGT… Eukaryot… 0 0 0 0 17
## 3 bo GAAAATCTTTGT… Eukaryot… 0 0 15876 11386 9976
## # ℹ 344 more rows
## # ℹ 169 more variables: S103 <int>, S104 <int>, S105 <int>,
## # S106 <int>, S107 <int>, S108 <int>, S109 <int>, S11 <int>,
## # S110 <int>, S111 <int>, …
## score:
## # A tibble: 2,776 × 4
## taxon_id sequence boot rank
## <chr> <chr> <dbl> <chr>
## 1 ab GAAAATCTTTGTGTCGGTGGTTCAAGTCCGCCTCCAAACA… 100 Doma…
## 2 ac GAAAATCTTTGTGTCGGTGGTTCAAGTCCGCCTCCAAACA… 100 King…
## 3 ad GAAAATCTTTGTGTCGGTGGTTCAAGTCCGCCTCCAAACA… 100 Phyl…
## # ℹ 2,773 more rows
## prop:
## # A tibble: 347 × 175
## taxon_id S1 S10 S11 S12 S13 S14 S15 S16
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 bm 0.409 0.0490 0.912 0 0.589 0 0 0.0384
## 2 bn 0 0 0 0 0 0 0.716 0
## 3 bo 0 0 0 0.0101 0.0194 0.550 0 0
## # ℹ 344 more rows
## # ℹ 166 more variables: S17 <dbl>, S18 <dbl>, S2 <dbl>,
## # S20 <dbl>, S21 <dbl>, S22 <dbl>, S23 <dbl>, S24 <dbl>,
## # S25 <dbl>, S26 <dbl>, …
## asv_prop:
## # A tibble: 347 × 175
## taxon_id S1 S10 S11 S12 S13 S14 S15 S16
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 bm 0.409 0.0490 0.912 0 0.589 0 0 0.0384
## 2 bn 0 0 0 0 0 0 0.716 0
## 3 bo 0 0 0 0.0101 0.0194 0.550 0 0
## # ℹ 344 more rows
## # ℹ 166 more variables: S17 <dbl>, S18 <dbl>, S2 <dbl>,
## # S20 <dbl>, S21 <dbl>, S22 <dbl>, S23 <dbl>, S24 <dbl>,
## # S25 <dbl>, S26 <dbl>, …
## tax_abund:
## # A tibble: 143 × 175
## taxon_id S1 S10 S11 S12 S13 S14 S15 S16 S17
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ab 29314 22903 6235 49267 19302 58145 33859 27995 32935
## 2 ac 29314 22903 6235 49267 19302 58145 33859 27995 32935
## 3 ad 29314 22903 6235 49267 19302 58145 33859 27995 32935
## # ℹ 140 more rows
## # ℹ 165 more variables: S18 <dbl>, S2 <dbl>, S20 <dbl>,
## # S21 <dbl>, S22 <dbl>, S23 <dbl>, S24 <dbl>, S25 <dbl>,
## # S26 <dbl>, S27 <dbl>, …
## tax_prop:
## # A tibble: 143 × 175
## taxon_id S1 S10 S11 S12 S13 S14 S15 S16 S17
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ab 1 1 1 1 1 1 1 1 1
## 2 ac 1 1 1 1 1 1 1 1 1
## 3 ad 1 1 1 1 1 1 1 1 1
## # ℹ 140 more rows
## # ℹ 165 more variables: S18 <dbl>, S2 <dbl>, S20 <dbl>,
## # S21 <dbl>, S22 <dbl>, S23 <dbl>, S24 <dbl>, S25 <dbl>,
## # S26 <dbl>, S27 <dbl>, …
## And 1 more data sets: tax_data
## 0 functions:
# Calculate the relative standard deviation for each taxon as a measure of how consistently it was found.
rsd <- function(x, na.rm = FALSE) {sd(x, na.rm = na.rm) / mean(x, na.rm = na.rm)}
obj_newdb$data$tax_data$rel_stand_dev <- map_dbl(1:nrow(obj_newdb$data$tax_abund), function(i) {
rsd(unlist(obj_newdb$data$tax_abund[i, metadata$sample_name]), na.rm = TRUE)
})
max_boot <- obj_newdb$data$score %>%
group_by(taxon_id) %>%
summarise(max = max(boot))
# Map the maximum bootstrap support to taxon IDs
max_boot <- setNames(max_boot$max, max_boot$taxon_id)
# Add maximum bootstrap support to tax_data
obj_newdb$data$tax_data$max_boot <- max_boot[match(obj_newdb$data$tax_data$taxon_id, names(max_boot))]
set.seed(10)
obj_newdb %>%
filter_taxa(! is_stem) %>%
filter_taxa(n_samples >= 1, supertaxa = TRUE, reassign_obs = FALSE) %>%
filter_taxa(! grepl(x = taxon_names, "_sp$"), reassign_obs = FALSE) %>%
filter_taxa(! grepl(x = taxon_names, "incertae_sedis", ignore.case = TRUE), reassign_obs = FALSE) %>%
remove_redundant_names() %>%
heat_tree(node_size = mean_prop,
edge_size = n_samples,
node_color = ifelse(is.na(max_boot), 0, max_boot),
node_color_range = c("#D2042D","#fc8d62","#a6d854","#66c2a5","lightblue","#b8c0cf"),
node_label = taxon_names,
node_size_range = c(0.01, 0.03),
node_label_size_range = c(0.014, 0.02),
edge_label_size_range = c(0.012, 0.015),
node_size_interval = c(0, 1),
edge_size_range = c(0.002, 0.01),
layout = "da", initial_layout = "re",
node_color_axis_label = "Bootstrap support",
node_size_axis_label = "Mean proportion of reads",
edge_size_axis_label = "Number of samples",
node_color_digits = 2,
node_size_digits = 2,
edge_color_digits = 2,
edge_size_digits = 2,
#aspect_ratio = 1.618,
output_file = file.path('demulticoder/figures', '/heattree_mostabund_taxa_rps10_newdb_boot0_greater1.pdf'))
### Community composition plot-new db-showcase differences in bootstrap support values
min_bootstrap <- 0
obj_olddb$data$score$boot <- as.numeric(obj_olddb$data$score$boot)
max_boot <- obj_olddb$data$score %>%
group_by(taxon_id) %>%
summarise(max = max(boot))
max_boot <- setNames(max_boot$max, max_boot$taxon_id)
obj_olddb <- filter_taxa(obj_olddb, max_boot[taxon_ids] >= min_bootstrap | taxon_ranks %in% c("ASV", "Reference"), reassign_obs = c(abund = TRUE, score = FALSE))
obj_olddb$data$asv_prop <- calc_obs_props(obj_olddb, 'abund', cols = metadata$sample_name)
## Calculating proportions from counts for 174 columns for 347 observations.
obj_olddb$data$tax_abund <- calc_taxon_abund(obj_olddb, 'abund', cols = metadata$sample_name)
## Summing per-taxon counts from 174 columns for 145 taxa
obj_olddb$data$tax_prop <- calc_taxon_abund(obj_olddb, 'asv_prop', cols = metadata$sample_name)
## Summing per-taxon counts from 174 columns for 145 taxa
obj_olddb$data$tax_data <- calc_n_samples(obj_olddb, 'tax_prop', cols = metadata$sample_name[metadata$sample_type == 'Sample'])
## Calculating number of samples with a value greater than 0 for 162 columns for 145 observations
obj_olddb$data$tax_data$mean_prop <- rowMeans(obj_olddb$data$tax_prop[, metadata$sample_name])
obj_olddb %>%
filter_taxa(taxon_ranks == "Species", supertaxa = TRUE, reassign_obs = c(tax_abund = FALSE))
## <Taxmap>
## 145 taxa: ab. Eukaryota ... fp. Lagena_sp
## 145 edges: NA->ab, ab->ac, ac->ad ... az->fn, at->fo, bk->fp
## 7 data sets:
## abund:
## # A tibble: 347 × 177
## taxon_id sequence dada2_tax S1 S10 S100 S101 S102
## <chr> <chr> <chr> <int> <int> <int> <int> <int>
## 1 bl GAAAATCTTTGT… Eukaryot… 11976 1123 0 0 1125
## 2 bm GAAAATCTTTGT… Eukaryot… 0 0 0 0 17
## 3 bn GAAAATCTTTGT… Eukaryot… 0 0 15876 11386 9976
## # ℹ 344 more rows
## # ℹ 169 more variables: S103 <int>, S104 <int>, S105 <int>,
## # S106 <int>, S107 <int>, S108 <int>, S109 <int>, S11 <int>,
## # S110 <int>, S111 <int>, …
## score:
## # A tibble: 2,776 × 4
## taxon_id sequence boot rank
## <chr> <chr> <dbl> <chr>
## 1 ab GAAAATCTTTGTGTCGGTGGTTCAAGTCCGCCTCCAAACA… 100 Doma…
## 2 ac GAAAATCTTTGTGTCGGTGGTTCAAGTCCGCCTCCAAACA… 100 King…
## 3 ad GAAAATCTTTGTGTCGGTGGTTCAAGTCCGCCTCCAAACA… 100 Phyl…
## # ℹ 2,773 more rows
## prop:
## # A tibble: 347 × 175
## taxon_id S1 S10 S11 S12 S13 S14 S15 S16
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 bl 0.409 0.0490 0.912 0 0.589 0 0 0.0384
## 2 bm 0 0 0 0 0 0 0.716 0
## 3 bn 0 0 0 0.0101 0.0194 0.550 0 0
## # ℹ 344 more rows
## # ℹ 166 more variables: S17 <dbl>, S18 <dbl>, S2 <dbl>,
## # S20 <dbl>, S21 <dbl>, S22 <dbl>, S23 <dbl>, S24 <dbl>,
## # S25 <dbl>, S26 <dbl>, …
## asv_prop:
## # A tibble: 347 × 175
## taxon_id S1 S10 S11 S12 S13 S14 S15 S16
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 bl 0.409 0.0490 0.912 0 0.589 0 0 0.0384
## 2 bm 0 0 0 0 0 0 0.716 0
## 3 bn 0 0 0 0.0101 0.0194 0.550 0 0
## # ℹ 344 more rows
## # ℹ 166 more variables: S17 <dbl>, S18 <dbl>, S2 <dbl>,
## # S20 <dbl>, S21 <dbl>, S22 <dbl>, S23 <dbl>, S24 <dbl>,
## # S25 <dbl>, S26 <dbl>, …
## tax_abund:
## # A tibble: 145 × 175
## taxon_id S1 S10 S11 S12 S13 S14 S15 S16 S17
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ab 29314 22903 6235 49267 19302 58145 33859 27995 32935
## 2 ac 29314 22903 6235 49267 19302 58145 33859 27995 32935
## 3 ad 29314 22903 6235 49267 19302 58145 33859 27995 32935
## # ℹ 142 more rows
## # ℹ 165 more variables: S18 <dbl>, S2 <dbl>, S20 <dbl>,
## # S21 <dbl>, S22 <dbl>, S23 <dbl>, S24 <dbl>, S25 <dbl>,
## # S26 <dbl>, S27 <dbl>, …
## tax_prop:
## # A tibble: 145 × 175
## taxon_id S1 S10 S11 S12 S13 S14 S15 S16 S17
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ab 1 1 1 1 1 1 1 1 1
## 2 ac 1 1 1 1 1 1 1 1 1
## 3 ad 1 1 1 1 1 1 1 1 1
## # ℹ 142 more rows
## # ℹ 165 more variables: S18 <dbl>, S2 <dbl>, S20 <dbl>,
## # S21 <dbl>, S22 <dbl>, S23 <dbl>, S24 <dbl>, S25 <dbl>,
## # S26 <dbl>, S27 <dbl>, …
## And 1 more data sets: tax_data
## 0 functions:
# Calculate the relative standard deviation for each taxon as a measure of how consistently it was found.
rsd <- function(x, na.rm = FALSE) {sd(x, na.rm = na.rm) / mean(x, na.rm = na.rm)}
obj_olddb$data$tax_data$rel_stand_dev <- map_dbl(1:nrow(obj_olddb$data$tax_abund), function(i) {
rsd(unlist(obj_olddb$data$tax_abund[i, metadata$sample_name]), na.rm = TRUE)
})
max_boot <- obj_olddb$data$score %>%
group_by(taxon_id) %>%
summarise(max = max(boot))
# Map the maximum bootstrap support to taxon IDs
max_boot <- setNames(max_boot$max, max_boot$taxon_id)
# Add maximum bootstrap support to tax_data
obj_olddb$data$tax_data$max_boot <- max_boot[match(obj_olddb$data$tax_data$taxon_id, names(max_boot))]
set.seed(10)
obj_olddb %>%
filter_taxa(! is_stem) %>%
filter_taxa(n_samples >= 1, supertaxa = TRUE, reassign_obs = FALSE) %>%
filter_taxa(! grepl(x = taxon_names, "_sp$"), reassign_obs = FALSE) %>%
filter_taxa(! grepl(x = taxon_names, "incertae_sedis", ignore.case = TRUE), reassign_obs = FALSE) %>%
remove_redundant_names() %>%
heat_tree(node_size = mean_prop,
edge_size = n_samples,
node_color = ifelse(is.na(max_boot), 0, max_boot),
node_color_range = c("#D2042D","#fc8d62","#a6d854","#66c2a5","lightblue","#b8c0cf"),
node_label = taxon_names,
node_size_range = c(0.01, 0.03),
node_label_size_range = c(0.014, 0.02),
edge_label_size_range = c(0.012, 0.015),
node_size_interval = c(0, 1),
edge_size_range = c(0.002, 0.01),
layout = "da", initial_layout = "re",
node_color_axis_label = "Bootstrap support",
node_size_axis_label = "Mean proportion of reads",
edge_size_axis_label = "Number of samples",
node_color_digits = 2,
node_size_digits = 2,
edge_color_digits = 2,
edge_size_digits = 2,
#aspect_ratio = 1.618,
output_file = file.path('demulticoder/figures', '/heattree_mostabund_taxa_rps10_olddb_boot0_greater1.pdf'))
Let’s facet the two plots so we have a side-by-side comparison of the two databases.
## quartz_off_screen
## 2
## Proportion of times new db value is higher than old db value: 0.4092219
## Proportion of times new db value is less than old db value: 0.4899135
## Proportion of times new db value is equal to old db value: 0.1008646
sessioninfo::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
## setting value
## version R version 4.3.2 (2023-10-31)
## os macOS Ventura 13.2.1
## system aarch64, darwin20
## ui X11
## language (EN)
## collate en_US.UTF-8
## ctype en_US.UTF-8
## tz America/Los_Angeles
## date 2025-01-07
## pandoc 3.1.1 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/ (via rmarkdown)
##
## ─ Packages ───────────────────────────────────────────────────────────────────
## package * version date (UTC) lib source
## abind 1.4-5 2016-07-21 [1] CRAN (R 4.3.0)
## ade4 1.7-22 2023-02-06 [1] CRAN (R 4.3.0)
## agricolae 1.3-7 2023-10-22 [1] CRAN (R 4.3.1)
## AlgDesign 1.2.1 2022-05-25 [1] CRAN (R 4.3.0)
## ape 5.8 2024-04-11 [1] CRAN (R 4.3.1)
## askpass 1.2.0 2023-09-03 [1] CRAN (R 4.3.0)
## backports 1.5.0 2024-05-23 [1] CRAN (R 4.3.3)
## Biobase 2.62.0 2023-10-26 [1] Bioconductor
## BiocGenerics * 0.48.1 2023-11-02 [1] Bioconductor
## BiocParallel 1.36.0 2023-10-26 [1] Bioconductor
## biomformat 1.30.0 2023-10-26 [1] Bioconductor
## Biostrings * 2.70.3 2024-04-03 [1] bioc_xgit (@c213e35)
## bit 4.5.0.1 2024-12-03 [1] CRAN (R 4.3.3)
## bit64 4.5.2 2024-09-22 [1] CRAN (R 4.3.3)
## bitops 1.0-8 2024-07-29 [1] CRAN (R 4.3.3)
## broom 1.0.6 2024-05-17 [1] CRAN (R 4.3.3)
## bslib 0.8.0 2024-07-29 [1] CRAN (R 4.3.3)
## cachem 1.1.0 2024-05-16 [1] CRAN (R 4.3.2)
## car 3.1-2 2023-03-30 [1] CRAN (R 4.3.0)
## carData 3.0-5 2022-01-06 [1] CRAN (R 4.3.0)
## cli 3.6.3 2024-06-21 [1] CRAN (R 4.3.2)
## cluster 2.1.6 2023-12-01 [1] CRAN (R 4.3.1)
## codetools 0.2-20 2024-03-31 [1] CRAN (R 4.3.1)
## colorspace 2.1-1 2024-07-26 [1] CRAN (R 4.3.3)
## cowplot * 1.1.3 2024-01-22 [1] CRAN (R 4.3.1)
## crayon 1.5.3 2024-06-20 [1] CRAN (R 4.3.3)
## data.table * 1.15.4 2024-03-30 [1] CRAN (R 4.3.1)
## decontam * 1.22.0 2023-10-26 [1] Bioconductor
## DelayedArray 0.28.0 2023-11-06 [1] Bioconductor
## DESeq2 1.42.1 2024-03-09 [1] Bioconductor 3.18 (R 4.3.3)
## digest 0.6.37 2024-08-19 [1] CRAN (R 4.3.3)
## dplyr * 1.1.4 2023-11-17 [1] CRAN (R 4.3.1)
## evaluate 1.0.1 2024-10-10 [1] CRAN (R 4.3.3)
## farver 2.1.2 2024-05-13 [1] CRAN (R 4.3.3)
## fastmap 1.2.0 2024-05-15 [1] CRAN (R 4.3.2)
## forcats * 1.0.0 2023-01-29 [1] CRAN (R 4.3.0)
## foreach 1.5.2 2022-02-02 [1] CRAN (R 4.3.0)
## furrr * 0.3.1 2022-08-15 [1] CRAN (R 4.3.0)
## future * 1.34.0 2024-07-29 [1] CRAN (R 4.3.3)
## generics 0.1.3 2022-07-05 [1] CRAN (R 4.3.0)
## GenomeInfoDb * 1.38.8 2024-03-16 [1] Bioconductor 3.18 (R 4.3.3)
## GenomeInfoDbData 1.2.11 2023-11-14 [1] Bioconductor
## GenomicRanges 1.54.1 2023-10-30 [1] Bioconductor
## ggfittext 0.10.2 2024-02-01 [1] CRAN (R 4.3.1)
## ggplot2 * 3.5.1 2024-04-23 [1] CRAN (R 4.3.1)
## ggpubr 0.6.0 2023-02-10 [1] CRAN (R 4.3.0)
## ggsignif 0.6.4 2022-10-13 [1] CRAN (R 4.3.0)
## globals 0.16.3 2024-03-08 [1] CRAN (R 4.3.1)
## glue 1.8.0 2024-09-30 [1] CRAN (R 4.3.3)
## gridExtra * 2.3 2017-09-09 [1] CRAN (R 4.3.0)
## gtable 0.3.6 2024-10-25 [1] CRAN (R 4.3.3)
## hms 1.1.3 2023-03-21 [1] CRAN (R 4.3.0)
## htmltools 0.5.8.1 2024-04-04 [1] CRAN (R 4.3.1)
## igraph 2.0.3 2024-03-13 [1] CRAN (R 4.3.1)
## IRanges * 2.36.0 2023-10-26 [1] Bioconductor
## iterators 1.0.14 2022-02-05 [1] CRAN (R 4.3.0)
## jquerylib 0.1.4 2021-04-26 [1] CRAN (R 4.3.0)
## jsonlite 1.8.8 2023-12-04 [1] CRAN (R 4.3.1)
## knitr 1.49 2024-11-08 [1] CRAN (R 4.3.3)
## labeling 0.4.3 2023-08-29 [1] CRAN (R 4.3.0)
## lattice * 0.22-6 2024-03-20 [1] CRAN (R 4.3.1)
## lazyeval 0.2.2 2019-03-15 [1] CRAN (R 4.3.0)
## lifecycle 1.0.4 2023-11-07 [1] CRAN (R 4.3.1)
## listenv 0.9.1 2024-01-29 [1] CRAN (R 4.3.1)
## locfit 1.5-9.10 2024-06-24 [1] CRAN (R 4.3.3)
## magick * 2.8.4 2024-07-14 [1] CRAN (R 4.3.3)
## magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.3.0)
## MASS 7.3-60.0.1 2024-01-13 [1] CRAN (R 4.3.1)
## Matrix 1.6-5 2024-01-11 [1] CRAN (R 4.3.1)
## MatrixGenerics 1.14.0 2023-10-26 [1] Bioconductor
## matrixStats 1.3.0 2024-04-11 [1] CRAN (R 4.3.1)
## metacoder * 0.3.7 2024-02-20 [1] CRAN (R 4.3.1)
## mgcv 1.9-1 2023-12-21 [1] CRAN (R 4.3.1)
## multtest 2.58.0 2023-10-26 [1] Bioconductor
## munsell 0.5.1 2024-04-01 [1] CRAN (R 4.3.1)
## nlme 3.1-165 2024-06-06 [1] CRAN (R 4.3.3)
## parallelly 1.41.0 2024-12-18 [1] CRAN (R 4.3.3)
## pdftools * 3.4.1 2024-09-20 [1] CRAN (R 4.3.3)
## permute * 0.9-7 2022-01-27 [1] CRAN (R 4.3.0)
## phyloseq * 1.46.0 2024-04-03 [1] bioc_xgit (@7320133)
## pillar 1.10.0 2024-12-17 [1] CRAN (R 4.3.3)
## pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.3.0)
## plyr 1.8.9 2023-10-02 [1] CRAN (R 4.3.1)
## purrr * 1.0.2 2023-08-10 [1] CRAN (R 4.3.0)
## qpdf 1.3.4 2024-10-04 [1] CRAN (R 4.3.3)
## R6 2.5.1 2021-08-19 [1] CRAN (R 4.3.0)
## ragg 1.3.2 2024-05-15 [1] CRAN (R 4.3.3)
## Rcpp 1.0.13 2024-07-17 [1] CRAN (R 4.3.2)
## RCurl 1.98-1.16 2024-07-11 [1] CRAN (R 4.3.3)
## readr * 2.1.5 2024-01-10 [1] CRAN (R 4.3.1)
## reshape2 1.4.4 2020-04-09 [1] CRAN (R 4.3.0)
## rhdf5 2.46.1 2023-12-02 [1] Bioconductor 3.18 (R 4.3.2)
## rhdf5filters 1.14.1 2023-11-06 [1] Bioconductor
## Rhdf5lib 1.24.2 2024-02-10 [1] Bioconductor 3.18 (R 4.3.2)
## rlang 1.1.4 2024-06-04 [1] CRAN (R 4.3.2)
## rmarkdown 2.27 2024-05-17 [1] CRAN (R 4.3.3)
## rstatix 0.7.2 2023-02-01 [1] CRAN (R 4.3.0)
## rstudioapi 0.16.0 2024-03-24 [1] CRAN (R 4.3.1)
## S4Arrays 1.2.1 2024-03-05 [1] Bioconductor 3.18 (R 4.3.2)
## S4Vectors * 0.40.2 2023-11-25 [1] Bioconductor 3.18 (R 4.3.2)
## sass 0.4.9 2024-03-15 [1] CRAN (R 4.3.1)
## scales 1.3.0 2023-11-28 [1] CRAN (R 4.3.1)
## sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.3.0)
## SparseArray 1.2.4 2024-02-10 [1] Bioconductor 3.18 (R 4.3.2)
## stringi 1.8.4 2024-05-06 [1] CRAN (R 4.3.2)
## stringr * 1.5.1 2023-11-14 [1] CRAN (R 4.3.1)
## SummarizedExperiment 1.32.0 2023-11-06 [1] Bioconductor
## survival 3.7-0 2024-06-05 [1] CRAN (R 4.3.3)
## svglite 2.1.3 2023-12-08 [1] CRAN (R 4.3.1)
## systemfonts 1.1.0 2024-05-15 [1] CRAN (R 4.3.3)
## textshaping 0.4.0 2024-05-24 [1] CRAN (R 4.3.3)
## tibble 3.2.1 2023-03-20 [1] CRAN (R 4.3.0)
## tidyr * 1.3.1 2024-01-24 [1] CRAN (R 4.3.1)
## tidyselect 1.2.1 2024-03-11 [1] CRAN (R 4.3.1)
## tzdb 0.4.0 2023-05-12 [1] CRAN (R 4.3.0)
## utf8 1.2.4 2023-10-22 [1] CRAN (R 4.3.1)
## vctrs 0.6.5 2023-12-01 [1] CRAN (R 4.3.1)
## vegan * 2.6-6.1 2024-05-21 [1] CRAN (R 4.3.3)
## viridis 0.6.5 2024-01-29 [1] CRAN (R 4.3.1)
## viridisLite 0.4.2 2023-05-02 [1] CRAN (R 4.3.0)
## vroom 1.6.5 2023-12-05 [1] CRAN (R 4.3.1)
## withr 3.0.2 2024-10-28 [1] CRAN (R 4.3.3)
## xfun 0.49 2024-10-31 [1] CRAN (R 4.3.3)
## XVector * 0.42.0 2023-10-26 [1] Bioconductor
## yaml 2.3.10 2024-07-26 [1] CRAN (R 4.3.3)
## zlibbioc 1.48.2 2024-03-19 [1] Bioconductor 3.18 (R 4.3.3)
##
## [1] /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/library
##
## ──────────────────────────────────────────────────────────────────────────────